import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import (train_test_split, cross_val_score, KFold,
                                     StratifiedShuffleSplit, ShuffleSplit,
                                     RandomizedSearchCV)
# preprocessing
from sklearn import preprocessing
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
Anomaly detection is a classification process in which rare items, events, or observations in data sets are identified. Learn more about this here. In this article, we investigate Credit Card Fraud Detection dataset from Kaggle.com.
Credit card companies must be able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.
The datasets contain transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions. It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
Path ='Data/creditcard.csv'
Data = pd.read_csv(Path, sep=',')
Labels = ['Normal', 'Fraud']
Target = 'Class'
Col = []
# Temp = re.findall("(\d+)", s)
for s in Data.columns:
if any(map(str.isdigit, s)) == True:
Temp = s.split('V')
Col.append('V'+ Temp[-1].zfill(2))
else:
Col.append(s)
Data.columns = Col
del Col
display(pd.DataFrame(Data.shape, columns = ['Count'], index = ['Attributes', 'Instances']).T)
def Data_info(Inp, Only_NaN = False):
Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
Out ['Size'] = Inp.shape[0]
Out['Percentage'] = 100 - np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
Out.index.name = 'Features'
Out['Data Type'] = Out['Data Type'].astype(str)
if Only_NaN:
Out = Out.loc[Out['Number of NaN Values']>0]
return Out
data_info = Data_info(Data).reset_index(drop = False)
fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
color_discrete_sequence = ['PaleGreen', 'LightBlue', 'PeachPuff'], hover_data = data_info.columns)
fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1, y=.5, traceorder="normal",
bordercolor="DarkGray", borderwidth=1), height = 400, width = 980)
fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside')
fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
fig.show()
| Attributes | Instances | |
|---|---|---|
| Count | 284807 | 31 |
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.hist(Data.loc[Data.Class == 0, 'Amount'], 100, color = '#34495e', hatch = '/', lw = 1.5,
edgecolor = '#3498db', label = Labels[0])
_ = ax.hist(Data.loc[Data.Class == 1, 'Amount'], 10, Color = '#e74c3c', hatch = '\\', lw = 1.5,
edgecolor = 'DarkRed', label = Labels[1])
_ = ax.set_xlabel('Amount')
_ = ax.set_ylabel('Frequency (Logarithm Scale)')
_ = ax.set_xlim([0, 2e4])
_ = ax.set_yscale('log')
_ = ax.set_ylim([0, 1e6])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.scatter(Data.loc[Data.Class == 0, 'Time'], Data.loc[Data.Class == 0, 'Amount'], s= 30,
facecolors='SkyBlue', edgecolors='MidnightBlue', alpha = 0.8, label = Labels[0])
_ = ax.scatter(Data.loc[Data.Class == 1, 'Time'], Data.loc[Data.Class == 1, 'Amount'], s= 30,
facecolors='Orange', edgecolors='DarkRed', alpha = 1, label = Labels[1])
_ = ax.set_xlabel('Time (in seconds)')
_ = ax.set_ylabel('Amount')
_ = ax.set_xlim([-500, Data.Time.max()+500])
_ = ax.set_ylim([-250, 2e4])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)
Let's look at the transaction class distribution.
def Dist_Table(Inp = Data, Target = Target):
Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
Table[Target] = Table[Target].replace(dict(zip([0,1],Labels)))
Table['Percentage'] = 100 - np.round(100*(Table['Count']/Table['Count'].sum()),2)
return Table
Table = Dist_Table()
def Dist_Plot(Table, PieColors = ['SeaGreen', 'FireBrick'], TableColors = ['Navy','White']):
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
specs=[[{"type": "table"},{"type": "pie"}]])
# Right
fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values, pull=[0, 0.1], textfont=dict(size=16),
marker=dict(colors = PieColors, line=dict(color='black', width=1))), row=1, col=2)
fig.update_traces(hole=.5)
fig.update_layout(height = 400, legend=dict(orientation="v"), legend_title_text= Target,
annotations=[dict(text= '<b>' + Target + '<b>', x=0.835, y=0.5, font_size=14, showarrow=False)])
# Left
T = Table.copy()
T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
Temp = []
for i in T.columns:
Temp.append(T.loc[:,i].values)
fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
fill_color= TableColors[0], align=['center','center'],
font=dict(color='white', size=12), height=25), columnwidth = [0.2, 0.2, 0.2],
cells=dict(values=Temp, line_color='darkslategray',
fill=dict(color= [TableColors[1], TableColors[1]]),
align=['center', 'center'], font_size=12, height=20)), 1, 1)
fig.update_layout(title={'text': '<b>' + Target + 'Distribution' + '<b>', 'x':0.5,
'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Dist_Plot(Table)
The dataset is quite large, so we use the pandas DataFrame `sample` feature, taking one-tenth of the data as a sample.
df= Data.sample(frac = 0.1, random_state=1).reset_index(drop = True)
Dist_Plot(Dist_Table(df), PieColors = ['CornflowerBlue', 'OrangeRed'], TableColors = ['Purple','Lavenderblush'])
First off, let's define $X$ and $y$ sets.
X = df.drop(columns = [Target])
y = df[Target]
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# scaling data
scaler = preprocessing.StandardScaler()
X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(data = X_std, columns =X.columns)
del scaler
fig, ax = plt.subplots(2, 1, figsize=(18, 8))
ax = ax.ravel()
font = FontProperties()
font.set_weight('bold')
CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", 20)]
Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
Sets = [X, X_std]
kws = dict(label='Feature\nVariance', aspect=20, shrink= .3)
for i in range(len(ax)):
Temp = Sets[i].var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
_ = sns.heatmap(Temp, ax=ax[i], annot=True, square=True, cmap = CP[i],
linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0], annot_kws={"size": 6},
cbar_kws=kws)
_ = ax[i].set_yticklabels('')
_ = ax[i].set_title(Names[i], fontproperties=font, fontsize = 16)
del Temp
X = X_std.copy()
del CP, Names, ax, fig, font, Sets, kws,
fig, ax = plt.subplots(figsize=(17,20))
Temp = pd.concat([X, y], axis = 1)
Temp = Temp.corr().round(2)
Temp = Temp.loc[(Temp.index == Target)].drop(columns = Target).T.sort_values(by = Target).T
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap =sns.color_palette("Greens", n_colors=10),
linewidths = 0.8, vmin=0, vmax=1,
annot_kws={"size": 12},
cbar_kws={'label': Target + ' Correlation', "aspect":40, "shrink": .4, "orientation": "horizontal"})
_ = ax.set_yticklabels('')
del Temp
Modifying dataset.
df[X.columns.tolist()] = X_std[X.columns.tolist()]
df.to_csv (Path.split(".")[0]+'_STD.csv', index = None, header=True)
StratifiedShuffleSplit is a stratified variation of ShuffleSplit which returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
X_train, X_test = X.loc[train_index], X.loc[test_index]
y_train, y_test = y[train_index], y[test_index]
del sss
Colors = ['SeaGreen', 'FireBrick']
nc = 2
fig = make_subplots(rows=1, cols=nc, specs=[[{'type':'domain'}]*nc])
fig.add_trace(go.Pie(labels=Labels,
values=y_train.value_counts().values,
pull=[0, 0.1],
name= 'Train Set',
textfont=dict(size=16),
marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Labels,
values=y_test.value_counts().values,
pull=[0, 0.1],
name= 'Test Set',
textfont=dict(size=16),
marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.5)
fig.update_layout(height = 400, legend=dict(orientation="v"),
legend_title_text= Target,
annotations=[dict(text= '<b>' + 'Train<br>Set' + '<b>', x=0.195, y=0.5, font_size=14, showarrow=False),
dict(text= '<b>' + 'Test<br>Set' + '<b>', x=0.8, y=0.5, font_size=14, showarrow=False)],
title={'text': '<b>' + Target + '<b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
This model optimizes the log-loss function using LBFGS or stochastic gradient descent. See sklearn.neural_network.MLPClassifier.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
BACK = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN}
FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
print(BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL + ' ' + FORE[C] +
Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = 'Blue'):
FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
print(FORE[C] + Style.NORMAL + L*'=' + Style.RESET_ALL)
def Scoring(model, X, y, n_splits = 20, RS = 42):
kfold = KFold(n_splits= n_splits, random_state= RS, shuffle = True)
ROC = cross_val_score(model, X, y, cv=kfold, scoring = 'roc_auc')
bACC = cross_val_score(model, X, y, cv=kfold, scoring = 'balanced_accuracy')
ROC = ROC[np.logical_not(np.isnan(ROC))]
bACC[np.logical_not(np.isnan(bACC))]
return ROC, bACC
def Performance_Table(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
Cols = ['Set', 'ROC Accuracy', 'Balanced Accuracy']
ROC, bACC = Scoring(model, X = X_train, y = y_train)
data = ['Train Set', ('%.4f' % ROC.mean())+ ' ± ' + ('%.4f' % ROC.std()),
('%.4f' % bACC.mean())+ ' ± ' + ('%.4f' % bACC.std())]
Out = pd.DataFrame(data = data, index = Cols).T
ROC, bACC = Scoring(model, X = X_test, y = y_test)
data = ['Test Set', ('%.4f' % ROC.mean())+ ' ± ' + ('%.4f' % ROC.std()),
('%.4f' % bACC.mean())+ ' ± ' + ('%.4f' % bACC.std())]
Temp = pd.DataFrame(data = data, index = Cols).T
Out = pd.concat([Out, Temp]).reset_index(drop = True)
return Out
def Classification_Report_CV(model, X, y, n_splits = 20, CM_method = 'Sum'):
Reports = []
CM = []
def classification_report_with_accuracy_score(y_true, y_pred):
Reports.append(pd.DataFrame(metrics.classification_report(y_true, y_pred,
target_names = Labels, output_dict = True)).T.values)
CM.append(metrics.confusion_matrix(y_true, y_pred))
return metrics.accuracy_score(y_true, y_pred)
cross_val_score(model, X=X, y=y, cv=StratifiedShuffleSplit(n_splits=n_splits, random_state=42),\
scoring=metrics.make_scorer(classification_report_with_accuracy_score))
Reports_All = Reports[0].ravel()
CM_All = CM[0].ravel()
for i in range(1, len(Reports)):
Reports_All = np.vstack((Reports_All, Reports[i].ravel()))
for i in range(1, len(CM)):
CM_All = np.vstack((CM_All, CM[i].ravel()))
R = pd.DataFrame(metrics.classification_report(y_train, model.predict(X_train),
target_names = Labels, output_dict = True)).T
Mean = pd.DataFrame(Reports_All.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(Reports_All.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
if CM_method == 'Sum':
CM = CM_All.mean(axis = 0).reshape(CM[0].shape).round(0).astype(int)
else:
CM = CM_All.sum(axis = 0).reshape(CM[0].shape).round(0).astype(int)
Reports.index.name = 'CV = % i' % n_splits
return CM, Reports
def Confusion_Mat(CM_Train, CM_Test, n_splits = 20):
# Font
font = FontProperties()
font.set_weight('bold')
Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
CM = [CM_Train, CM_Test]
for i in range(2):
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle(Titles[i], fontproperties=font, fontsize = 16)
_ = sns.heatmap(CM[i], annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
linewidths = 0.2, cbar_kws={"shrink": 1})
_ = ax[0].set_title('Confusion Matrix');
_ = sns.heatmap(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis],
annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_title('Normalized Confusion Matrix');
for a in ax:
_ = a.set_xlabel('Predicted labels')
_ = a.set_ylabel('True labels');
_ = a.xaxis.set_ticklabels(Labels)
_ = a.yaxis.set_ticklabels(Labels)
_ = a.set_aspect(1)
def Tables_and_Plots(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
n_splits = 20, CM_method = 'Sum'):
display(Performance_Table(model).style.hide_index())
Header('Train Set', C = 'Green')
CM_Train, R_Train = Classification_Report_CV(model, X=X_train, y=y_train, n_splits = n_splits, CM_method = CM_method)
display(R_Train)
Header('Test Set', C = 'Red')
CM_Test, R_Test = Classification_Report_CV(model, X=X_test, y=y_test, n_splits = n_splits, CM_method = CM_method)
display(R_Test)
Line()
Confusion_Mat(CM_Train, CM_Test, n_splits = n_splits)
def Grid_Table(grid):
Temp = [str(x) for x in grid.cv_results_['params']]
Temp = [s.replace('{', '').replace('}', '').replace("'", '') for s in Temp]
Table = pd.DataFrame({'rank_test_score': grid.cv_results_['rank_test_score'],
'params':Temp,
'mean_test_score': grid.cv_results_['mean_test_score'],
'mean_fit_time': grid.cv_results_['mean_fit_time']})
Table = Table.round(4).sort_values('rank_test_score').set_index('rank_test_score')
return Table
def Grid_Performance_Plot(Table):
font = FontProperties()
font.set_weight('bold')
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
Z = zip(axes, ['mean_test_score', 'mean_fit_time'], ['Blue', 'Red'],['Classification Accuracy', 'Fit Time (with caching)'])
for ax, col, c, title in Z:
_ = ax.errorbar(x = Table['params'], y = Table[col], yerr = Table[col], color = c)
_ = ax.set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
_ = ax.set_ylim(bottom = 0)
_ = ax.set_xlabel('Paramerers')
_ = ax.set_title(title, fontproperties=font, fontsize = 14)
def Best_Parm(model, param_dist, Top = None,
X_train = X_train, y_train= y_train, X_test = X_test, y_test = y_test):
grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
cv = KFold(n_splits = 20, shuffle = True),
n_iter = int(1e3),
scoring = 'precision',
error_score = 0,
verbose = 0,
n_jobs = 10,
return_train_score = True)
_ = grid.fit(X_train, y_train)
display(pd.DataFrame({'Best Score': [grid.best_score_],
'Best Paramerers': [str(grid.best_params_)],
'Precision': [grid.score(X_test,y_test)]}).round(4).style.hide_index().set_precision(4))
Table = Grid_Table(grid)
if Top == None:
Top = Table.shape[0]
display(Table.reset_index(drop = False).head(Top).style.hide_index().\
set_precision(4).background_gradient(subset= ['mean_test_score'], cmap='Greens').\
background_gradient(subset= ['mean_fit_time'], cmap='Oranges'))
Grid_Performance_Plot(Table)
return grid
Some of the metrics that we use here to measure the accuracy: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_p & F_p\\ F_n & T_n\end{bmatrix}. \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [6] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
Header('MLP with Default Parameters')
MLP = MLPClassifier(max_iter = 1000, verbose= False)
print('Default Parameters = %s' % MLP.get_params(deep=True))
_ = MLP.fit(X_train, y_train)
Tables_and_Plots(MLP, n_splits = 20)
MLP with Default Parameters ======================================================================== Default Parameters = {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
| Set | ROC Accuracy | Balanced Accuracy |
|---|---|---|
| Train Set | 0.9207 ± 0.1425 | 0.8245 ± 0.1986 |
| Test Set | 0.9267 ± 0.1695 | 0.8875 ± 0.2012 |
Train Set ==========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| Normal | 0.9993 ± 0.0005 | 0.9999 ± 0.0002 | 0.9996 ± 0.0003 | 1991.0000 ± 0.0000 |
| Fraud | 0.8250 ± 0.3631 | 0.5667 ± 0.3180 | 0.6550 ± 0.3232 | 3.0000 ± 0.0000 |
| accuracy | 0.9993 ± 0.0006 | 0.9993 ± 0.0006 | 0.9993 ± 0.0006 | 0.9993 ± 0.0006 |
| macro avg | 0.9122 ± 0.1818 | 0.7833 ± 0.1590 | 0.8273 ± 0.1617 | 1994.0000 ± 0.0000 |
| weighted avg | 0.9991 ± 0.0010 | 0.9993 ± 0.0006 | 0.9991 ± 0.0008 | 1994.0000 ± 0.0000 |
Test Set ===========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| Normal | 0.9991 ± 0.0007 | 0.9999 ± 0.0003 | 0.9995 ± 0.0004 | 853.0000 ± 0.0000 |
| Fraud | 0.8750 ± 0.3112 | 0.6000 ± 0.3000 | 0.6917 ± 0.2803 | 2.0000 ± 0.0000 |
| accuracy | 0.9990 ± 0.0008 | 0.9990 ± 0.0008 | 0.9990 ± 0.0008 | 0.9990 ± 0.0008 |
| macro avg | 0.9370 ± 0.1559 | 0.8000 ± 0.1500 | 0.8456 ± 0.1403 | 855.0000 ± 0.0000 |
| weighted avg | 0.9988 ± 0.0013 | 0.9990 ± 0.0008 | 0.9988 ± 0.0010 | 855.0000 ± 0.0000 |
====================================================================================================
param_dist = {'solver': ['lbfgs', 'sgd', 'adam'],
'alpha': [10.0**x for x in np.arange(-1,-4,-1)],
'learning_rate' : ['constant', 'invscaling', 'adaptive']}
Header('MLP with the Best Parameters')
grid_model = Best_Parm(model = MLP, param_dist = param_dist)
MLP with the Best Parameters =======================================================================
| Best Score | Best Paramerers | Precision |
|---|---|---|
| 0.5250 | {'solver': 'lbfgs', 'learning_rate': 'constant', 'alpha': 0.1} | 0.8571 |
| rank_test_score | params | mean_test_score | mean_fit_time |
|---|---|---|---|
| 1 | solver: lbfgs, learning_rate: constant, alpha: 0.1 | 0.5250 | 4.1000 |
| 1 | solver: adam, learning_rate: constant, alpha: 0.1 | 0.5250 | 5.8269 |
| 1 | solver: adam, learning_rate: constant, alpha: 0.001 | 0.5250 | 4.1252 |
| 1 | solver: adam, learning_rate: adaptive, alpha: 0.01 | 0.5250 | 4.9430 |
| 1 | solver: adam, learning_rate: invscaling, alpha: 0.1 | 0.5250 | 5.8354 |
| 1 | solver: lbfgs, learning_rate: adaptive, alpha: 0.1 | 0.5250 | 3.0492 |
| 1 | solver: adam, learning_rate: adaptive, alpha: 0.1 | 0.5250 | 5.9839 |
| 8 | solver: lbfgs, learning_rate: adaptive, alpha: 0.01 | 0.5167 | 3.4216 |
| 9 | solver: adam, learning_rate: invscaling, alpha: 0.001 | 0.5000 | 3.8667 |
| 9 | solver: adam, learning_rate: constant, alpha: 0.01 | 0.5000 | 5.4050 |
| 9 | solver: adam, learning_rate: adaptive, alpha: 0.001 | 0.5000 | 3.8014 |
| 12 | solver: lbfgs, learning_rate: invscaling, alpha: 0.01 | 0.4900 | 3.1442 |
| 13 | solver: adam, learning_rate: invscaling, alpha: 0.01 | 0.4750 | 5.2985 |
| 13 | solver: sgd, learning_rate: adaptive, alpha: 0.1 | 0.4750 | 9.7054 |
| 13 | solver: lbfgs, learning_rate: invscaling, alpha: 0.1 | 0.4750 | 3.3720 |
| 16 | solver: lbfgs, learning_rate: constant, alpha: 0.01 | 0.4667 | 3.4136 |
| 17 | solver: lbfgs, learning_rate: constant, alpha: 0.001 | 0.4667 | 3.1282 |
| 17 | solver: lbfgs, learning_rate: invscaling, alpha: 0.001 | 0.4667 | 3.0717 |
| 19 | solver: sgd, learning_rate: adaptive, alpha: 0.01 | 0.4250 | 9.4522 |
| 20 | solver: lbfgs, learning_rate: adaptive, alpha: 0.001 | 0.4233 | 3.2399 |
| 21 | solver: sgd, learning_rate: adaptive, alpha: 0.001 | 0.3750 | 9.1187 |
| 21 | solver: sgd, learning_rate: constant, alpha: 0.001 | 0.3750 | 4.6676 |
| 21 | solver: sgd, learning_rate: constant, alpha: 0.1 | 0.3750 | 5.1398 |
| 21 | solver: sgd, learning_rate: constant, alpha: 0.01 | 0.3750 | 4.9608 |
| 25 | solver: sgd, learning_rate: invscaling, alpha: 0.1 | 0.0000 | 5.7992 |
| 25 | solver: sgd, learning_rate: invscaling, alpha: 0.001 | 0.0000 | 5.1013 |
| 25 | solver: sgd, learning_rate: invscaling, alpha: 0.01 | 0.0000 | 5.2963 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
Header('MLP with the Best Parameters')
MLP = MLPClassifier(max_iter = 1000, alpha = grid_model.best_params_['alpha'],
learning_rate = grid_model.best_params_['learning_rate'],
solver = grid_model.best_params_['solver'], verbose= False)
print('Best Parameters = %s' % MLP.get_params(deep=True))
_ = MLP.fit(X_train, y_train)
Tables_and_Plots(MLP, n_splits = 20)
MLP with the Best Parameters ======================================================================= Best Parameters = {'activation': 'relu', 'alpha': 0.1, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'lbfgs', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
| Set | ROC Accuracy | Balanced Accuracy |
|---|---|---|
| Train Set | 0.9499 ± 0.0904 | 0.8670 ± 0.1855 |
| Test Set | 0.9335 ± 0.1725 | 0.9625 ± 0.1192 |
Train Set ==========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| Normal | 0.9995 ± 0.0004 | 0.9998 ± 0.0002 | 0.9997 ± 0.0003 | 1991.0000 ± 0.0000 |
| Fraud | 0.8250 ± 0.2594 | 0.7000 ± 0.2963 | 0.7357 ± 0.2605 | 3.0000 ± 0.0000 |
| accuracy | 0.9994 ± 0.0005 | 0.9994 ± 0.0005 | 0.9994 ± 0.0005 | 0.9994 ± 0.0005 |
| macro avg | 0.9123 ± 0.1298 | 0.8499 ± 0.1481 | 0.8677 ± 0.1304 | 1994.0000 ± 0.0000 |
| weighted avg | 0.9993 ± 0.0008 | 0.9994 ± 0.0005 | 0.9993 ± 0.0006 | 1994.0000 ± 0.0000 |
Test Set ===========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| Normal | 0.9994 ± 0.0007 | 0.9999 ± 0.0003 | 0.9997 ± 0.0004 | 853.0000 ± 0.0000 |
| Fraud | 0.9250 ± 0.2385 | 0.7500 ± 0.2958 | 0.8083 ± 0.2543 | 2.0000 ± 0.0000 |
| accuracy | 0.9994 ± 0.0008 | 0.9994 ± 0.0008 | 0.9994 ± 0.0008 | 0.9994 ± 0.0008 |
| macro avg | 0.9622 ± 0.1195 | 0.8750 ± 0.1479 | 0.9040 ± 0.1273 | 855.0000 ± 0.0000 |
| weighted avg | 0.9992 ± 0.0011 | 0.9994 ± 0.0008 | 0.9992 ± 0.0010 | 855.0000 ± 0.0000 |
====================================================================================================
In the next article, we try to improve these results using PyTorch MLP.